Pre-processing

  1. Adapters were trimmed using cutadapt v1.16
  2. Gene expression was quantified using salmon v1.3.0
  3. TPMs were obtained for the genes using tximport 1.20.0
  4. Sample HTB314 was removed from this analysis due to its size factor being very different from others and leading to abberant results.
library(dplyr)
library(ggplot2)
library(DESeq2)
## Warning: package 'GenomeInfoDb' was built under R version 4.1.1
## Warning: package 'MatrixGenerics' was built under R version 4.1.1
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)
library(fgsea)
library(limma)

##This section of code will generate a summary table of the samples sequenced and their sequencing and alignment metrics

Summary of Data Metrics
Sample patient reads %Q30 Duplication rate % reads with adapter STAR alignment number percent aligned splices annotated salmon mapping
Bulk MDS268 31441538 93 24 2.3 25549835 81.26 5546270 58.2075
Bulk MDS280 28794421 93 24 2.3 23317205 80.98 5115546 58.8614
CD123+ MDS268 32307881 93 27 3.3 26243441 81.23 7290861 64.2150
CD123+ MDS280 23745205 93 34 2.6 21035358 88.59 5668653 71.3245
CD123- MDS268 27917999 93 26 2.7 22835189 81.79 5793322 63.9284
CD123- MDS280 24767573 93 32 2.6 22467937 90.72 4773823 62.7193

##This section of code will import and format the data for DeSeq2

#to generate a vector of names and file locations
files<-file.path("~/Desktop/Jordan files/Counts/salmon/salmon-314/", list.files("~/Desktop/Jordan files/Counts/salmon/salmon-314//"), "quant.sf")
names(files)<-list.files("~/Desktop/Jordan files/Counts/salmon/salmon-314/")

#to call in a gene_map this was derived by pulling it out of the fasta file with a grep command for ENST and ENSG and pasting them together
gene_map=read_csv("~/Desktop/Jordan files/Counts/salmon/gene_map.csv", col_names=c('enstid', 'ensgid'))

# import transcript level counts
txi.salmon.t<-tximport(files, type="salmon", txOut=TRUE)
txi.salmon.g<-tximport(files=files, type="salmon", tx2gene= gene_map, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
### this code works but if I remove the ignoreTxVersion I get an error, this may have to do with how I am generating my tx2gene file

# Extract counts only
counts <- txi.salmon.g$counts %>%
  as.data.frame()
#Extract TPM
tpms <- data.frame(txi.salmon.g$abundance)

##for clients the counts and tpm files should be written out

metadata<-read_table2("~/Desktop/Jordan files/Counts/sample.txt")
## Warning: `read_table2()` was deprecated in readr 2.0.0.
## Please use `read_table()` instead.
metadata<-filter(metadata, patient %in% c("MDS268", "MDS280"))

##This chunk of code sets an expression threshold for TPM where all samples have at least 5 counts

expressed_genes<-tpms %>% filter_all(all_vars(. > 5))

##This chunk of code generates a heatmap using the spearman method as well as a correlation heatmap

PCA plot

exp.pca <- prcomp(t(log2(expressed_genes)))
exp.pca.summary <- summary(exp.pca)$importance
pc1var = round(exp.pca.summary[3,1] * 100, 1)
pc2var = round(exp.pca.summary[3,2] * 100 - pc1var, 1)
exp.pca.pc <- data.frame(exp.pca$x, sample = colnames(expressed_genes))

PC1 vs PC2

## 3. Run Differential Expression testing using DESeq2 and Calculate Gene Set Enrichment ## Compare 123pos vs 123neg, 123neg vs bulk, and 123pos vs bulk #### sig = padj <0.01 and abs(l2fc) >0.5 ####

coldata<-dplyr::select(metadata, -FileName) 
coldata<-column_to_rownames(coldata, 'SampleName')
## need to confirm that all names are in the same order
all(rownames(coldata) %in% colnames(txi.salmon.g$counts))
## [1] TRUE
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] TRUE
#coldata<-coldata[colnames(txi.salmon.g$counts),]
#all(rownames(coldata) == colnames(txi.salmon.g$counts))

dds <- DESeqDataSetFromTximport(txi.salmon.g,
                              colData = coldata,
                              design = ~ patient + cellType)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using just counts from tximport
dds<-estimateSizeFactors(dds)
sf<-as.data.frame(dds$sizeFactor)
sf<-rownames_to_column(sf, var="SampleName")
metadata<-inner_join(metadata, sf, by="SampleName")
names(metadata)[5] <- "sizeFactor"
metadata$sizeFactor <- round(metadata$sizeFactor ,digit=2)
### unfiltered data
dds.unfiltered <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.unfiltered <- results( dds.unfiltered)

##filtered data changed this from dds[rowMins(counts(dds))>5,] to a less stringent filtering
keep<-rowSums(counts(dds))>=10
#dds.filtered<-dds[rowMins(counts(dds))>5,]
dds.filtered<-dds[keep,]
dds.filtered<-DESeq(dds.filtered)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res.filtered<-results(dds.filtered)

###outcome the first filtering cut from 60000 genese to 14000 genes with the new filter 35000genes are left which is about 1/2
### need to subset to do pairwise comparison unclear if this keeps the patient design aspect


res_123posvs123neg_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123neg", "bulk"))

res_123posvs123neg_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"))

## look at the numbers of genes meeting threshold the log fold change call is not changing things
sum( res_123posvs123neg_unfiltered$pvalue < 0.01 & abs(res_123posvs123neg_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 606
sum( res_123posvsBulk_unfiltered$pvalue < 0.01 & abs(res_123posvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1608
sum(res_123negvsBulk_unfiltered$pvalue < 0.01 & abs(res_123negvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1165
sum( res_123posvs123neg_filtered$pvalue < 0.01 & abs(res_123posvs123neg_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1332
sum( res_123posvsBulk_filtered$pvalue < 0.01 & abs(res_123posvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2720
sum(res_123negvsBulk_filtered$pvalue < 0.01 & abs(res_123negvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2051
sum(res_123posvs123neg_filtered$padj < 0.05, na.rm=TRUE)
## [1] 564
sum(res_123posvs123neg_unfiltered$padj < 0.05, na.rm=TRUE)
## [1] 153

Volcano Plot

## Warning: Removed 8077 rows containing missing values (geom_point).

## Warning: Removed 38239 rows containing missing values (geom_point).

## Warning: Removed 38976 rows containing missing values (geom_point).

## Warning: Removed 8077 rows containing missing values (geom_point).

## Warning: Removed 38976 rows containing missing values (geom_point).

## Warning: Removed 8077 rows containing missing values (geom_point).

###MA plots

pca_data<-vst(dds, blind=T)
ntop=500
rv <- rowVars(assay(pca_data))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(pca_data)[select,]))
percentVar2 <- data.frame(percentVar=pca$sdev^2/sum(pca$sdev^2))%>%
  mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df<-pca$x%>%data.frame()%>%mutate(type=metadata$cellType,percent=metadata$percentrRNA)
## Warning: Unknown or uninitialised column: `percentrRNA`.
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")

ggplot(pca_df, aes(PC1, PC2, color=type, shape=metadata$patient))+geom_point(size=3)+
  xlab(paste0("PC1: ",round(percentVar2$percentVar[1] * 100), "% variance")) + 
  ylab(paste0("PC2: ", round(percentVar2$percentVar[2] * 100), "% variance"))+
  scale_color_manual(values=alt_col_values)+labs(color="")+theme_bw()+
  ggtitle("PCA of analyzed data")

pca_data2<-vst(dds.filtered, blind=T)
ntop=500
rv <- rowVars(assay(pca_data2))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca2 <- prcomp(t(assay(pca_data2)[select,]))
percentVar4 <- data.frame(percentVar=pca2$sdev^2/sum(pca2$sdev^2))%>%
  mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df2<-pca2$x%>%data.frame()%>%mutate(type=metadata$cellType, percent=metadata$percentrRNA)
## Warning: Unknown or uninitialised column: `percentrRNA`.
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")

ggplot(pca_df2, aes(PC1, PC2, color=type, shape=metadata$patient))+geom_point(size=3)+
  xlab(paste0("PC1: ",round(percentVar4$percentVar[1] * 100), "% variance")) + 
  ylab(paste0("PC2: ", round(percentVar4$percentVar[2] * 100), "% variance"))+
  scale_color_manual(values=alt_col_values)+labs(color="")+theme_bw()+
  ggtitle("PCA of analyzed data using filtered data")

##generates a plot of FBX015 a gene in paper up in 123+
d<-plotCounts(dds.filtered, gene="ENSG00000141665",intgroup=c("cellType","patient"), returnData = TRUE)

ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()

##generates a plot of EEFSEC a gene in paper up in 123+
d<-plotCounts(dds.filtered, gene="ENSG00000132394",intgroup=c("cellType","patient"), returnData = TRUE)

ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()

##generates a plot of TGM4 a gene in paper down in 123+
d<-plotCounts(dds, gene="ENSG00000163810",intgroup=c("cellType","patient"), returnData = TRUE)

ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()

##generates a plot of CRLF2 a gene in paper down in 123+
d<-plotCounts(dds, gene="ENSG00000163810",intgroup=c("cellType","patient"), returnData = TRUE)

ggplot(d, aes(x=cellType, y=count, color=patient))+geom_point()

###GSEA analysis

##generate gene names to go with ensembl gene id as the pathways will use gene name
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host='www.ensembl.org')
t2g_hs <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
ens2gene <- t2g_hs[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)

##loading hallmark pathways and KEGG pathways

pathways.hallmark <- gmtPathways("~/Desktop/Jordan files/h.all.v7.4.symbols.gmt")
pathways.kegg<-gmtPathways("~/Desktop/Jordan files/c2.cp.kegg.v7.4.symbols.gmt")

##generating tidy data and adding it to the results file
##123pos vs 123neg analysis
res_posneg_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"),tidy=TRUE)
colnames(res_posneg_gsea)[1]<-"ensembl_gene_id"
res_posneg_gsea <- inner_join(res_posneg_gsea,ens2gene,by="ensembl_gene_id")
res_posneg_gsea2<-res_posneg_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))
rank_posneg<-deframe(res_posneg_gsea2)
fgseaRes_posneg <- fgsea(pathways=pathways.hallmark, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.55% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.55% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_tidy <-fgseaRes_posneg%>%
  as_tibble() %>%
  arrange(desc(NES))

fgseaRes_posneg_kegg_tidy <-fgseaRes_posneg_kegg%>%
  as_tibble() %>%
  arrange(desc(NES))

##123 pos vs bulk analysis

res_posbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"),tidy=TRUE)
colnames(res_posbulk_gsea)[1]<-"ensembl_gene_id"
res_posbulk_gsea <- inner_join(res_posbulk_gsea,ens2gene,by="ensembl_gene_id")

res_posbulk_gsea2<-res_posbulk_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))
rank_posbulk<-deframe(res_posbulk_gsea2)
fgseaRes_posbulk<- fgsea(pathways=pathways.hallmark, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.82% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.82% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_tidy <-fgseaRes_posbulk%>%
  as_tibble() %>%
  arrange(desc(NES))

fgseaRes_posbulk_kegg_tidy <-fgseaRes_posbulk_kegg%>%
  as_tibble() %>%
  arrange(desc(NES))

##123 neg vs bulk analysis
res_negbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"),tidy=TRUE)
colnames(res_negbulk_gsea)[1]<-"ensembl_gene_id"
res_negbulk_gsea <- inner_join(res_negbulk_gsea,ens2gene,by="ensembl_gene_id")


res_negbulk_gsea2<-res_negbulk_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))
rank_negbulk<-deframe(res_negbulk_gsea2)
fgseaRes_negbulk<- fgsea(pathways=pathways.hallmark, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.99% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.99% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_tidy <-fgseaRes_negbulk%>%
  as_tibble() %>%
  arrange(desc(NES))

fgseaRes_negbulk_kegg_tidy <-fgseaRes_negbulk_kegg%>%
  as_tibble() %>%
  arrange(desc(NES))
fgseaRes_posneg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs 123neg.')
fgseaRes_posneg_kegg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs 123neg.')
fgseaRes_posbulk_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs bulk.')
fgseaRes_posbulk_kegg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs bulk.')
fgseaRes_negbulk_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: Hallmark genes with 123neg vs bulk.')
fgseaRes_posneg_kegg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: KEGG pathways with 123neg vs bulk.')
ggplot(fgseaRes_posneg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA 123pos vs 123neg") + 
  theme_minimal()

ggplot(fgseaRes_posneg_kegg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Kegg pathways NES from GSEA 123pos vs 123neg") + 
  theme_minimal()

ggplot(fgseaRes_posbulk_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA 123pos vs bulk") + 
  theme_minimal()

ggplot(fgseaRes_posbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Kegg pathways NES from GSEA 123pos vs bulk") + 
  theme_minimal()

ggplot(fgseaRes_negbulk_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA 123neg vs bulk") + 
  theme_minimal()

ggplot(fgseaRes_negbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Kegg pathways NES from GSEA 123neg vs bulk ") + 
  theme_minimal()